import pandas as pd
import altair as alt
# Load the "jobs" table from the vega-datasets CDN into a DataFrame.
# The charts below read its `job`, `sex`, `year` and `perc` columns.
jobs_url = "https://cdn.jsdelivr.net/npm/vega-datasets@2.8.0/data/jobs.json"
jobs = pd.read_json(jobs_url)
# Homework 5
Your Turn: Wide-long and long-wide
# Remove Altair's row limit so the whole jobs table can be embedded in the
# chart specification.
alt.data_transformers.enable('default', max_rows=None)

# Pivot long -> wide: one row per (job, sex) with one column per year holding
# `perc`, so the 1950 and 2000 values can be plotted against each other.
alt.Chart(jobs).transform_pivot(
    'year',
    groupby=['job', 'sex'],
    value='perc'
).mark_point().encode(
    x='1950:Q',
    y='2000:Q',
    color='sex:N',
    tooltip=['sex', '1950:Q', '2000:Q', 'job']
).properties(
    title="Percentage of men and women in various fields: 1950 vs 2000"
)
# Above is my plot showing percentage of men/women in different jobs in 1950
# and 2000. One thing I would’ve liked to add here is logarithmic axes, but I
# was having issues when doing so.
# NOTE(review): a log scale cannot place zero values; if `perc` contains 0
# for some rows, a 'symlog' scale type may work instead — confirm against the
# data.
from altair import datum
# One line per job showing `perc` over `year`, with a separate row of the
# facet grid for each sex.
alt.Chart(jobs).mark_line().encode(
    x="year:O",
    y="perc:Q",
    color="job:N"
).facet(row="sex:N")
# The plot above shows the percentage of men and women working different jobs
# over time, using every job in the dataset. This is obviously cluttered…
# Same faceted line chart, restricted to two jobs. The filter is applied to
# the base chart before faceting, using a one-of predicate on `job`.
alt.Chart(jobs).transform_filter(
    alt.FieldOneOfPredicate(
        field="job",
        oneOf=["Accountant / Auditor", "Carpenter"]
    )
).mark_line().encode(
    x="year:O",
    y="perc:Q",
    color="job:N"
).facet(row="sex:N")
# This plot shows only accountants/auditors and carpenters, which simplifies
# the graph significantly, although it does show less info.
# Single job only: `perc` over `year`, one line per sex.
alt.Chart(jobs).transform_filter(
    alt.datum.job == "Accountant / Auditor"
).mark_line().encode(
    x="year:O",
    y="perc:Q",
    color="sex:N"
)
# This graph shows only the allocation for accountant/auditors (my future
# career) over time, using color to differentiate sexes.
Your turn: Maps
# Load the gapminder table from the vega-datasets CDN and preview the first
# four rows.
gapminder_url = "https://cdn.jsdelivr.net/npm/vega-datasets@1.29.0/data/gapminder.json"
gapminder = pd.read_json(gapminder_url)
print(gapminder.head(4))
# Recorded output:
# year country cluster pop life_expect fertility
# 0 1955 Afghanistan 0 8891209 30.332 7.7
# 1 1960 Afghanistan 0 9829450 31.997 7.7
# 2 1965 Afghanistan 0 10997885 34.020 7.7
# 3 1970 Afghanistan 0 12430623 36.088 7.7
import json
from urllib.request import urlopen
# Download the world-atlas TopoJSON and compare its country names with the
# names used in the gapminder table.
world2_url = 'https://cdn.jsdelivr.net/npm/world-atlas@2/countries-110m.json'
# Use a context manager so the HTTP response is closed after parsing.
with urlopen(world2_url) as response:
    world2 = json.load(response)

country_names = [
    geometry['properties']['name']
    for geometry in world2['objects']['countries']['geometries']
]

map_names = set(country_names)
gapminder_names = set(gapminder['country'])

# Sorted so the lists (and anything printed from them) are reproducible;
# plain set iteration order is arbitrary.
common_names = sorted(map_names & gapminder_names)    # in both sources
missing_names = sorted(gapminder_names - map_names)   # gapminder only
extra_names = sorted(map_names - gapminder_names)     # map only
# names in gapminder and in map data
print("in common:", len(common_names), common_names)
# names in gapminder but not in map data
print("missing in map:", len(missing_names), missing_names)
# names in the map data but not in gapminder
print("extra in map", len(extra_names), extra_names)
# Recorded output from one run (the order of names within each list may
# differ between runs):
# in common: 57 ['France', 'Lebanon', 'Israel', 'Haiti', 'Philippines', 'Turkey', 'Jamaica', 'North Korea', 'Spain', 'Nigeria', 'Pakistan', 'South Africa', 'Iraq', 'Peru', 'Austria', 'Egypt', 'Iran', 'Mexico', 'United Kingdom', 'Bangladesh', 'India', 'Germany', 'Ireland', 'Japan', 'Ecuador', 'Rwanda', 'Venezuela', 'China', 'El Salvador', 'Iceland', 'Kenya', 'Finland', 'Norway', 'Australia', 'Canada', 'Belgium', 'Greece', 'Italy', 'Colombia', 'Indonesia', 'Chile', 'South Korea', 'Poland', 'Brazil', 'Georgia', 'Saudi Arabia', 'Afghanistan', 'Bolivia', 'Croatia', 'Argentina', 'Portugal', 'Bahamas', 'Netherlands', 'Cuba', 'Costa Rica', 'New Zealand', 'Switzerland']
# missing in map: 6 ['Barbados', 'Aruba', 'Hong Kong', 'United States', 'Dominican Republic', 'Grenada']
# extra in map 120 ['Zambia', 'Serbia', 'Mozambique', 'Macedonia', 'Bosnia and Herz.', 'Turkmenistan', 'Tanzania', 'W. Sahara', "Côte d'Ivoire", 'Nepal', 'Yemen', 'Uganda', 'Burundi', 'Gabon', 'Romania', 'Greenland', 'Oman', 'Kuwait', 'Sri Lanka', 'Estonia', 'Kosovo', 'Cyprus', 'Lesotho', 'Qatar', 'Latvia', 'Guatemala', 'Nicaragua', 'Mali', 'Slovenia', 'Niger', 'Jordan', 'Albania', 'Senegal', 'Russia', 'Tunisia', 'Sweden', 'Uruguay', 'Ethiopia', 'Thailand', 'Azerbaijan', 'Fr. S. Antarctic Lands', 'Uzbekistan', 'Mongolia', 'Armenia', 'Guinea', 'Montenegro', 'Botswana', 'United States of America', 'Denmark', 'New Caledonia', 'Bhutan', 'Syria', 'Belarus', 'Togo', 'Belize', 'Mauritania', 'Congo', 'Cambodia', 'Vanuatu', 'Malawi', 'Kyrgyzstan', 'Suriname', 'Zimbabwe', 'Falkland Is.', 'Sierra Leone', 'Guinea-Bissau', 'Algeria', 'Taiwan', 'Czechia', 'United Arab Emirates', 'Timor-Leste', 'Dominican Rep.', 'Panama', 'Myanmar', 'Paraguay', 'Brunei', 'Madagascar', 'Ukraine', 'N. Cyprus', 'Dem. Rep. Congo', 'Gambia', 'Libya', 'Liberia', 'Laos', 'Djibouti', 'Trinidad and Tobago', 'Vietnam', 'Central African Rep.', 'Eq. Guinea', 'Solomon Is.', 'Chad', 'Tajikistan', 'Morocco', 'Sudan', 'eSwatini', 'Papua New Guinea', 'Somalia', 'Lithuania', 'Luxembourg', 'Burkina Faso', 'Antarctica', 'Honduras', 'Guyana', 'Cameroon', 'Angola', 'Somaliland', 'Malaysia', 'Palestine', 'S. Sudan', 'Eritrea', 'Benin', 'Ghana', 'Kazakhstan', 'Moldova', 'Puerto Rico', 'Fiji', 'Namibia', 'Bulgaria', 'Hungary', 'Slovakia']
# Choropleth of life expectancy by country for a single year.
world = alt.topo_feature(world2_url, feature='countries')

# The lookup joins one row per key, but gapminder has one row per country
# *per year*. Restrict the table to a single year (the most recent one) so
# each country maps to exactly one life_expect value.
latest_year = gapminder['year'].max()

# Two gapminder names are spelled differently in the map data (see the
# "missing in map" / "extra in map" lists printed earlier); align them so
# those countries are matched by the lookup.
name_fixes = {
    'United States': 'United States of America',
    'Dominican Republic': 'Dominican Rep.',
}
gapminder_latest = (
    gapminder.loc[gapminder['year'] == latest_year,
                  ['country', 'life_expect', 'year']]
    .assign(country=lambda df: df['country'].replace(name_fixes))
)

world_map = alt.Chart(world).mark_geoshape(
).transform_lookup(
    lookup="properties.name",
    from_=alt.LookupData(gapminder_latest, 'country', ['life_expect', 'year'])
).encode(
    # Explicit legend title; countries with no gapminder row have a null
    # life_expect.
    fill=alt.Fill(
        'life_expect:Q',
        legend=alt.Legend(title=f'Life expectancy ({latest_year})')
    )
).properties(
    width=800,
    height=400
)
world_map
# I’ve tried to aggregate and show just one year, but with no luck. I’m also
# not sure why there’s no legend.
# NOTE(review): the single-year filter is done in pandas above rather than
# with an Altair aggregate. If the legend is still missing, check that the
# lookup actually matched (life_expect not all null) — not verified here.
from vega_datasets import data
# Choropleth of the number of airports per US state.
airports = data.airports()
states = alt.topo_feature(data.us_10m.url, feature='states')

# The original lookup used a `state` key without first confirming the map
# features expose one. NOTE(review): us_10m state features appear to be keyed
# by numeric FIPS `id` — confirm. The airports table's `state` column is
# assumed to hold postal abbreviations — confirm. Translate one to the other
# here.
STATE_FIPS = {
    'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'CA': 6, 'CO': 8, 'CT': 9,
    'DE': 10, 'DC': 11, 'FL': 12, 'GA': 13, 'HI': 15, 'ID': 16, 'IL': 17,
    'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24,
    'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31,
    'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38,
    'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'RI': 44, 'SC': 45, 'SD': 46,
    'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'WA': 53, 'WV': 54,
    'WI': 55, 'WY': 56, 'PR': 72,
}

# Count airports per state in pandas. Using `count()` in the encoding
# aggregates the geoshape rows themselves, which leaves nothing to draw.
airport_counts = (
    airports.assign(id=airports['state'].map(STATE_FIPS))
    .dropna(subset=['id'])  # drop rows whose state code has no FIPS entry
    .astype({'id': int})
    .groupby('id')
    .size()
    .reset_index(name='airport_count')
)

state_map = alt.Chart(states).mark_geoshape(
    stroke='steelblue'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(airport_counts, 'id', ['airport_count'])
).encode(
    fill=alt.Fill('airport_count:Q', legend=alt.Legend(title='Airports'))
).project('albersUsa')
state_map.properties(width=500, height=300)
# For some reason the ‘fill’ and the projection is not working. Probably an
# issue with the lookup.
# NOTE(review): the lookup key and the `count()` aggregate were the likely
# causes; both are addressed above, but the result has not been rendered here.
Visualization galleries
I saw a graphic on the Reddit page of NFL receiving yards leaders in 2023, but adjusted for Pass Interference calls. The story here is that if DPI yards were included in total yards, the leaderboard changes dramatically. It’s a good graphic because it makes use of so many features. It has bars for each player’s actual receiving yards, another bar for DPI yards (which is a different color), text showing the actual number of actual, DPI, and adjusted total yards, and arrows showing the shift in ranking due to the adjustment. I know how to implement the stacked bars, text, x- and y-axis labels, and the colors. I don’t know how to implement the arrows showing the change in ranking on the leaderboard.